{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data loading, storage, and file formats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "from numpy.random import randn\n",
    "import numpy as np\n",
    "import os\n",
    "import sys\n",
    "import matplotlib.pyplot as plt\n",
    "np.random.seed(12345)\n",
    "plt.rc('figure', figsize=(10, 6))\n",
    "from pandas import Series, DataFrame\n",
    "import pandas as pd\n",
    "np.set_printoptions(precision=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading and Writing Data in Text Format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!cat ch06/ex1.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df = pd.read_csv('ch06/ex1.csv')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pd.read_table('ch06/ex1.csv', sep=',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!cat ch06/ex2.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pd.read_csv('ch06/ex2.csv', header=None)\n",
    "pd.read_csv('ch06/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "names = ['a', 'b', 'c', 'd', 'message']\n",
    "pd.read_csv('ch06/ex2.csv', names=names, index_col='message')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!cat ch06/csv_mindex.csv\n",
    "parsed = pd.read_csv('ch06/csv_mindex.csv', index_col=['key1', 'key2'])\n",
    "parsed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "list(open('ch06/ex3.txt'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "result = pd.read_table('ch06/ex3.txt', sep='\\s+')\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!cat ch06/ex4.csv\n",
    "pd.read_csv('ch06/ex4.csv', skiprows=[0, 2, 3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!cat ch06/ex5.csv\n",
    "result = pd.read_csv('ch06/ex5.csv')\n",
    "result\n",
    "pd.isnull(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sentinels = {'message': ['foo', 'NA'], 'something': ['two']}\n",
    "pd.read_csv('ch06/ex5.csv', na_values=sentinels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading text files in pieces"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "result = pd.read_csv('ch06/ex6.csv')\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pd.read_csv('ch06/ex6.csv', nrows=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)\n",
    "chunker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)\n",
    "\n",
    "tot = Series([])\n",
    "for piece in chunker:\n",
    "    tot = tot.add(piece['key'].value_counts(), fill_value=0)\n",
    "\n",
    "tot = tot.order(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "tot[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Writing data out to text format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data = pd.read_csv('ch06/ex5.csv')\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data.to_csv('ch06/out.csv')\n",
    "!cat ch06/out.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data.to_csv(sys.stdout, sep='|')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data.to_csv(sys.stdout, na_rep='NULL')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data.to_csv(sys.stdout, index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "dates = pd.date_range('1/1/2000', periods=7)\n",
    "ts = Series(np.arange(7), index=dates)\n",
    "ts.to_csv('ch06/tseries.csv')\n",
    "!cat ch06/tseries.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "Series.from_csv('ch06/tseries.csv', parse_dates=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Manually working with delimited formats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!cat ch06/ex7.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import csv\n",
    "f = open('ch06/ex7.csv')\n",
    "\n",
    "reader = csv.reader(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for line in reader:\n",
    "    print(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "lines = list(csv.reader(open('ch06/ex7.csv')))\n",
    "header, values = lines[0], lines[1:]\n",
    "data_dict = {h: v for h, v in zip(header, zip(*values))}\n",
    "data_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class my_dialect(csv.Dialect):\n",
    "    lineterminator = '\\n'\n",
    "    delimiter = ';'\n",
    "    quotechar = '\"'\n",
    "    quoting = csv.QUOTE_MINIMAL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "with open('mydata.csv', 'w') as f:\n",
    "    writer = csv.writer(f, dialect=my_dialect)\n",
    "    writer.writerow(('one', 'two', 'three'))\n",
    "    writer.writerow(('1', '2', '3'))\n",
    "    writer.writerow(('4', '5', '6'))\n",
    "    writer.writerow(('7', '8', '9'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%cat mydata.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### JSON data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "obj = \"\"\"\n",
    "{\"name\": \"Wes\",\n",
    " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n",
    " \"pet\": null,\n",
    " \"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\n",
    "              {\"name\": \"Katie\", \"age\": 33, \"pet\": \"Cisco\"}]\n",
    "}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import json\n",
    "result = json.loads(obj)\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "asjson = json.dumps(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "siblings = DataFrame(result['siblings'], columns=['name', 'age'])\n",
    "siblings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### XML and HTML, Web scraping\n",
    "\n",
    "**NB. The Yahoo! Finance API has changed and this example no longer works**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from lxml.html import parse\n",
    "from urllib2 import urlopen\n",
    "\n",
    "parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))\n",
    "\n",
    "doc = parsed.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "links = doc.findall('.//a')\n",
    "links[15:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "lnk = links[28]\n",
    "lnk\n",
    "lnk.get('href')\n",
    "lnk.text_content()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "urls = [lnk.get('href') for lnk in doc.findall('.//a')]\n",
    "urls[-10:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "tables = doc.findall('.//table')\n",
    "calls = tables[9]\n",
    "puts = tables[13]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "rows = calls.findall('.//tr')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def _unpack(row, kind='td'):\n",
    "    elts = row.findall('.//%s' % kind)\n",
    "    return [val.text_content() for val in elts]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "_unpack(rows[0], kind='th')\n",
    "_unpack(rows[1], kind='td')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from pandas.io.parsers import TextParser\n",
    "\n",
    "def parse_options_data(table):\n",
    "    rows = table.findall('.//tr')\n",
    "    header = _unpack(rows[0], kind='th')\n",
    "    data = [_unpack(r) for r in rows[1:]]\n",
    "    return TextParser(data, names=header).get_chunk()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "call_data = parse_options_data(calls)\n",
    "put_data = parse_options_data(puts)\n",
    "call_data[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Parsing XML with lxml.objectify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%cd ch06/mta_perf/Performance_XML_Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!head -21 Performance_MNR.xml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from lxml import objectify\n",
    "\n",
    "path = 'Performance_MNR.xml'\n",
    "parsed = objectify.parse(open(path))\n",
    "root = parsed.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data = []\n",
    "\n",
    "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n",
    "               'DESIRED_CHANGE', 'DECIMAL_PLACES']\n",
    "\n",
    "for elt in root.INDICATOR:\n",
    "    el_data = {}\n",
    "    for child in elt.getchildren():\n",
    "        if child.tag in skip_fields:\n",
    "            continue\n",
    "        el_data[child.tag] = child.pyval\n",
    "    data.append(el_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "perf = DataFrame(data)\n",
    "perf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "root"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "root.get('href')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "root.text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Binary data formats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cd ../.."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "frame = pd.read_csv('ch06/ex1.csv')\n",
    "frame\n",
    "frame.to_pickle('ch06/frame_pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pd.read_pickle('ch06/frame_pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using HDF5 format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "store = pd.HDFStore('mydata.h5')\n",
    "store['obj1'] = frame\n",
    "store['obj1_col'] = frame['a']\n",
    "store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "store['obj1']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "store.close()\n",
    "os.remove('mydata.h5')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Interacting with HTML and Web APIs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels'\n",
    "resp = requests.get(url)\n",
    "resp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "issue_labels = DataFrame(data)\n",
    "issue_labels"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Interacting with databases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import sqlite3\n",
    "\n",
    "query = \"\"\"\n",
    "CREATE TABLE test\n",
    "(a VARCHAR(20), b VARCHAR(20),\n",
    " c REAL,        d INTEGER\n",
    ");\"\"\"\n",
    "\n",
    "con = sqlite3.connect(':memory:')\n",
    "con.execute(query)\n",
    "con.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data = [('Atlanta', 'Georgia', 1.25, 6),\n",
    "        ('Tallahassee', 'Florida', 2.6, 3),\n",
    "        ('Sacramento', 'California', 1.7, 5)]\n",
    "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n",
    "\n",
    "con.executemany(stmt, data)\n",
    "con.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cursor = con.execute('select * from test')\n",
    "rows = cursor.fetchall()\n",
    "rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cursor.description"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "DataFrame(rows, columns=zip(*cursor.description)[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas.io.sql as sql\n",
    "sql.read_sql('select * from test', con)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
